# Age bands used in the descriptive tables. The bounds are half-open
# ([40, 60), [60, 80)) so that a fractional age can match one band only; with
# the earlier "> 39" / "> 59" / "> 79" tests an age such as 39.5 was first
# labelled "Under 40" and then overwritten by the next band. For whole-number
# ages the result is unchanged. Rows whose age is NA are skipped by every
# assignment and keep whatever value they already had.
data$age_range[data$age < 40] <- "Under 40"
data$age_range[data$age >= 40 & data$age < 60] <- "40s and 50s"
data$age_range[data$age >= 60 & data$age < 80] <- "60s and 70s"
# NOTE(review): this band includes age 80 itself despite the "More than 80" label.
data$age_range[data$age >= 80] <- "More than 80"
# Logged national, cumulative fatalities; the +1 maps a count of zero to log(1) = 0
data$log_nat_fatal_cumulat <- log(data$nat_fatal_cumulat + 1)
################################################################################
# 2 - Descriptive Statistics (Appendix)
################################################################################
# Descriptive data object: the variables summarised in Table A13
descr <- data %>%
  dplyr::select(
    tone, geo_fatal, geo_fatal_cumulat, mil,
    combat, dwnom1, dwnom2, woman, age, pip, obama,
    senate, word_count, word_count2, months
  )
# Table A13 (basic descriptives)
stargazer(descr, type = 'text')

# Denominator of every "Perc" column below. It used to be typed out as a
# literal in each table; it now lives in one place so the tables cannot drift
# apart. NOTE(review): presumably the total number of war speeches, i.e.
# nrow(data) -- confirm before replacing the literal with a computed value.
n_speeches <- 36456

# Table A4: War Speeches by Year
descr_year <- as.data.frame(table(data$year)) %>%
  mutate(Perc = Freq / n_speeches)
descr_year

# Table A5: War Speeches by Chamber
descr_chamber <- as.data.frame(table(data$chamber)) %>%
  mutate(Perc = Freq / n_speeches)
descr_chamber

# Table A6: War Speeches by Party
descr_party <- as.data.frame(table(data$party)) %>%
  mutate(Perc = Freq / n_speeches)
descr_party

# Table A7: War Speeches by Military Experience
descr_mil <- as.data.frame(table(data$mil)) %>%
  mutate(Perc = Freq / n_speeches)
descr_mil

# Table A8: War Speeches by Combat Experience
# milcomb is a copy of mil in which every row with combat == 1 is recoded to 2.
data$milcomb <- data$mil
data$milcomb[data$combat == 1] <- 2
descr_mil2 <- as.data.frame(table(data$milcomb)) %>%
  mutate(Perc = Freq / n_speeches)
descr_mil2

# Table A9: War Speeches by Age
descr_age <- as.data.frame(table(data$age_range)) %>%
  mutate(Perc = Freq / n_speeches)
descr_age
# Individual-level data: one row per distinct (icpsr, state, mil, party) combination
data_ind <- distinct(dplyr::select(data, icpsr, state, mil, party))

# Table A11: Legislators Delivering War Speeches by Party
table(data_ind[["party"]])

# Table A12: Legislators Delivering War Speeches by Military Experience
table(data_ind[["mil"]])

# Table A10: Legislators Delivering War Speeches by Year
# (distinct legislator-year-chamber rows, cross-tabulated year by chamber)
data_ind_year <- distinct(dplyr::select(data, icpsr, year, chamber))
table(data_ind_year[["year"]], data_ind_year[["chamber"]])
#### Military Experience Descriptives in Appendix A
# Pipe-delimited member metadata, restricted to Congresses 107 through 113.
meta <- read.delim("fullmeta_vet.txt", header = TRUE, sep = "|", quote = "")
meta <- dplyr::filter(meta, cong >= 107, cong <= 113)

# Veteran status: military_collapse 0 is a non-veteran, 1 and 2 are both
# veterans; every other value (including NA) stays "Missing".
meta$milcat <- "Missing"
meta$milcat[meta$military_collapse %in% 0] <- "Non-Veteran"
meta$milcat[meta$military_collapse %in% c(1, 2)] <- "Veteran"

# Combat status: only combat == 1 counts as combat experience.
meta$combatcat <- as.factor(
  ifelse(meta$combat %in% 1, "Combat Experience", "No Combat Experience")
)

# Number of members in each (veteran status, combat status) cell, per Congress.
milcat_dist <- meta %>%
  group_by(cong) %>%
  count(milcat, combatcat)

# Collapse the two indicators into one three-way label. Cells that fit none
# of the three cases (e.g. "Missing" veteran status) are left as NA.
milcat_dist$milcombat_cat <- dplyr::case_when(
  milcat_dist$milcat == "Non-Veteran" &
    milcat_dist$combatcat == "No Combat Experience" ~ "Non-Veteran",
  milcat_dist$milcat == "Veteran" &
    milcat_dist$combatcat == "No Combat Experience" ~ "Non-Combat Veteran",
  milcat_dist$milcat == "Veteran" &
    milcat_dist$combatcat == "Combat Experience" ~ "Combat Veteran",
  TRUE ~ NA_character_
)
milcat_dist$milcombat_cat <- as.factor(milcat_dist$milcombat_cat)
levels(milcat_dist$milcombat_cat)
# Fix the stacking / legend order used in Figure A1.
milcat_dist$milcombat_cat <- factor(
  milcat_dist$milcombat_cat,
  levels = c('Non-Veteran', 'Non-Combat Veteran', 'Combat Veteran')
)
#### Figure A1
# Stacked bars, one per Congress, rescaled to proportions (position = "fill")
# and split by the three-way veteran / combat label.
ggplot(milcat_dist, aes(x = as.factor(cong), y = n, fill = milcombat_cat)) +
  geom_col(position = "fill", color = "black", size = .4) +
  scale_fill_manual(values = c("#762a83", "#d9ef8b", "olivedrab4")) +
  scale_x_discrete(breaks = as.character(107:114),
                   labels = as.character(107:114)) +
  labs(x = "Session", y = "Percent of Serving Members", fill = "") +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank())
# Branch
# Raw cross-tabulation of Congress by branch code, with NA shown explicitly.
# NOTE(review): the hand-entered counts below were presumably read off this
# table -- confirm.
table(meta$cong, meta$branch, useNA = "always")

# Hand-entered counts per branch for Congresses 107-113 (one value per
# Congress, in that order). Sums such as 23 + 3 are kept exactly as written.
# NOTE(review): the added terms look like members recorded under combined
# branch codes -- confirm against the table above.
branch_counts <- list(
  "army"           = c(25, 28, 26, 25, 31, 28, 23 + 3),
  "marine corps"   = c(3, 5, 3, 3 + 1, 5 + 1, 9 + 1 + 1, 8 + 1),
  "navy"           = c(2, 3, 3, 4 + 1, 9 + 1, 8 + 1 + 1, 8 + 1 + 1 + 1),
  "air force"      = c(10, 11, 11, 11, 10, 14 + 1, 13 + 1 + 1),
  "coast guard"    = c(1, 1, 1, 1, 1, 0, 0),
  "national guard" = c(6, 6, 6, 4, 3, 4, 5 + 3 + 1)
)

# Build the long table directly with typed columns. The earlier version
# pre-allocated 49 rows but filled only 42 (leaving seven all-NA rows) and
# filled each row with c(number, "text", number), which turned the numbers
# into strings that then had to be converted back.
branc_dat <- data.frame(
  cong = rep(107:113, times = length(branch_counts)),
  branchcat = rep(names(branch_counts), each = 7),
  n = as.integer(unlist(branch_counts, use.names = FALSE)),
  stringsAsFactors = FALSE
)

branc_dat$branchcat <- as.factor(branc_dat$branchcat)
levels(branc_dat$branchcat)
# Fix the stacking / legend order used in Figure A2.
branc_dat$branchcat <- factor(
  branc_dat$branchcat,
  levels = c('army', 'air force', 'national guard',
             'marine corps', 'navy', 'coast guard')
)

# No row carries the "no service" label, so this keeps all 42 rows; it is
# retained because Figure A2 plots branc_dat_sub.
branc_dat_sub <- branc_dat %>% filter(branchcat != "no service")
#### Figure A2
# Stacked bars, one per Congress, rescaled to proportions (position = "fill")
# and split by branch of service.
ggplot(branc_dat_sub, aes(x = as.factor(cong), y = n, fill = branchcat)) +
  geom_col(position = "fill", color = "black", size = .4) +
  scale_fill_manual(
    values = c("#d9ef8b", "#8ba7ef", "#a7ef8b", "#efd38b", "#8bd9ef", "#8fb118")
  ) +
  scale_x_discrete(breaks = as.character(107:114),
                   labels = as.character(107:114)) +
  labs(x = "Session", y = "Percent of Serving Members", fill = "") +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank())
################################################################################
# 3 - Descriptive Visualizations of Tone
################################################################################
########
# Figure 1: Congressional Sentiment in Wartime Speeches
########
# Monthly series: mean tone and number of speeches in each calendar month
data2 <- data %>%
  group_by(month = lubridate::floor_date(date, "month")) %>%
  dplyr::summarize(avg_tone = mean(tone, na.rm = TRUE), n = n())

pdf("plots/fig1.pdf", height = 6.1, width = 6.6)
par(mar = c(4, 4, 3, 3), oma = c(0, 0, 0, 0), mfrow = c(1, 1))

# Set up the axes with invisible (white) points; the data are drawn last so
# they sit on top of the event markers.
plot(data2$month, data2$avg_tone,
     col = "white",
     xlab = "Date",
     ylab = "Average Tone of Reference to Iraq/Afg.",
     ylim = range(data2$avg_tone))

# Dashed vertical line plus rotated label for each event, drawn in this order.
invisible(Map(
  function(day, label) {
    abline(v = as.Date(day), col = "gray30", lty = 2)
    text(as.Date(day), 1.27, label, pos = 2, srt = 90, col = "gray30", cex = .75)
  },
  c("2003-03-20", "2004-04-27", "2009-01-20", "2007-02-16", "2013-12-30"),
  c("Invasion of Iraq", "Abu Ghraib Report", "Obama Inauguration",
    "Troop Surge Debate", "ISIS Captures Fallujah")
))

# Bubble area grows with the number of speeches that month (cex = n / 200).
points(data2$month, data2$avg_tone,
       pch = 21, lwd = 1, cex = data2$n / 200,
       bg = alpha("steelblue", .4), col = alpha("steelblue", 1))
# Lowess trend through the monthly means
lines(lowess(data2$avg_tone~data2$month, f = .2), col = "orangered", lwd = 3)
dev.off()
########
# Figure A3 Congressional Sentiment by Party
########
# Monthly mean tone and speech count, separately for each party indicator
data_rep <- data %>% filter(rep == 1) %>%
  group_by(month = lubridate::floor_date(date, "month")) %>%
  dplyr::summarize(avg_tone = mean(tone, na.rm = TRUE), n = n())
data_dem <- data %>% filter(dem == 1) %>%
  group_by(month = lubridate::floor_date(date, "month")) %>%
  dplyr::summarize(avg_tone = mean(tone, na.rm = TRUE), n = n())

pdf("plots/figA3.pdf", height = 6.1, width = 6.6)
par(mar = c(4, 4, 3, 3), oma = c(0, 0, 0, 0), mfrow = c(1, 1))

# Set up the axes with invisible (white) points. The y-axis spans BOTH series:
# the earlier limits ran from the Republican minimum to the Democratic maximum
# only, so a lower Democratic month or a higher Republican month would have
# been clipped. When those two values already are the overall extremes the
# figure is unchanged.
plot(data_rep$month, data_rep$avg_tone,
     col = "white",
     xlab = "Date",
     ylab = "Average Tone of Reference to Iraq/Afg.",
     ylim = range(data_rep$avg_tone, data_dem$avg_tone))

# Dashed vertical line plus rotated label for each event, drawn in this order.
invisible(Map(
  function(day, label) {
    abline(v = as.Date(day), col = "gray30", lty = 2)
    text(as.Date(day), 1.5, label, pos = 2, srt = 90, col = "gray30", cex = .75)
  },
  c("2003-03-20", "2004-04-27", "2009-01-20", "2007-02-16", "2013-12-30"),
  c("Invasion of Iraq", "Abu Ghraib Report", "Obama Inauguration",
    "Troop Surge Debate", "ISIS Captures Fallujah")
))

# Republicans: bubbles sized by monthly speech count, then a lowess trend
points(data_rep$month, data_rep$avg_tone,
       pch = 21, lwd = 1, cex = data_rep$n / 200,
       bg = alpha("firebrick2", .4), col = alpha("firebrick2", 1))
lines(lowess(data_rep$avg_tone~data_rep$month, f = .1), col = "firebrick4", lwd = 3)

# Democrats: same encoding
points(data_dem$month, data_dem$avg_tone,
       pch = 21, lwd = 1, cex = data_dem$n / 200,
       bg = alpha("dodgerblue", .4), col = alpha("dodgerblue", 1))
lines(lowess(data_dem$avg_tone~data_dem$month, f = .1), col = "dodgerblue3", lwd = 3)
dev.off()
########
# Figure 3: Average Speaker Tone in Wartime References by Ideology and Veteran Status
########
# Speaker-level summaries. mutate() writes each speaker's summary onto every
# one of their rows; select() + unique() then collapse to the distinct
# combinations of the kept columns. ungroup() drops the speakerid grouping so
# it cannot leak into later steps.
data2 <- data %>%
  group_by(speakerid) %>%
  mutate(tone = mean(tone, na.rm = TRUE),
         mil = max(mil, na.rm = TRUE),
         cas = sum(geo_fatal, na.rm = TRUE),
         dwnom1 = mean(dwnom1, na.rm = TRUE),
         rep = max(rep, na.rm = TRUE),
         n = n()) %>%
  dplyr::select(speakerid, state, tone, rep, mil, cas, dwnom1, n) %>%
  unique() %>%
  ungroup()

# Weighting the mean: each speaker's mean tone is shrunk toward the overall
# mean. The speaker's own mean gets weight n / var(speech-level tone); the
# overall mean gets weight 1 / var(speaker-level mean tone).
data2$tone_mean <- mean(data$tone, na.rm = TRUE)
data2$tone_var <- var(data$tone, na.rm = TRUE)
data2$tone_speaker_var <- var(data2$tone, na.rm = TRUE)
data2$tone_weighted <- ((data2$n/data2$tone_var)*(data2$tone) + (1/data2$tone_speaker_var)*(data2$tone_mean)) /
  ((data2$n/data2$tone_var) + (1/data2$tone_speaker_var))

pdf("plots/fig3.pdf", height = 7, width = 7.2)
par(mar = c(4, 4, 3, 3), oma = c(0, 0, 0, 0), mfrow = c(1, 1))
plot(NULL, # create empty plot
     xlim = c(-.75, 1),
     ylim = c(-1.75, 1.75),
     axes = FALSE, xlab = NA, ylab = NA)
grid()
box()

# Party means of the weighted tone (computed here, not drawn in this figure)
rep_mean <- mean(data2$tone_weighted[data2$rep == 1], na.rm = TRUE)
dem_mean <- mean(data2$tone_weighted[data2$rep == 0], na.rm = TRUE)

# Non-veterans with rep == 1 (red) and rep == 0 (blue); size is .6 + n / 100
points(data2$dwnom1[data2$mil == 0 & data2$rep == 1],
       data2$tone_weighted[data2$mil == 0 & data2$rep == 1],
       bg = alpha("#a50026", .4), col = alpha("#a50026", .4), lwd = 1,
       cex = .6 + (data2$n[data2$mil == 0 & data2$rep == 1]/100), pch = 21)
points(data2$dwnom1[data2$mil == 0 & data2$rep == 0],
       data2$tone_weighted[data2$mil == 0 & data2$rep == 0],
       bg = alpha("#313695", .4), col = alpha("#313695", .4), lwd = 1,
       cex = .6 + (data2$n[data2$mil == 0 & data2$rep == 0]/100), pch = 21)
# Veterans (either party) are drawn last, in green with a heavier outline
points(data2$dwnom1[data2$mil == 1], data2$tone_weighted[data2$mil == 1],
       bg = alpha("olivedrab3", .8), col = alpha("olivedrab4", 1), lwd = 1.4,
       cex = .6 + (data2$n[data2$mil == 1]/100), pch = 21)

axis(1)
axis(2)
mtext(side = 1, "Speaker DW-Nominate Score", line = 2)
mtext(side = 2, "Average Tone of Reference to Iraq or Afghanistan", line = 2.5, cex = 1.1)
box()
# Legend sizes follow the same rule as the points: .6 + n / 100 for n = 1, 50, 250
legend(0.63, -1.32, legend = c("1 Reference", "50 References", "250 References"), pch = 21,
       pt.cex = c(.61, 1.1, 3.1),
       x.intersp = 1.4, y.intersp = 1.4, cex = .7,
       bty = "n", col = "black", pt.bg = alpha("gray80", 0.5))
dev.off()
########
# Figure 2: Top and Bottom 10 Average Tone of Speakers
########
# Taking Mean among speakers w/ >25 speeches. mutate() writes the per-speaker
# summaries onto every row; select() + unique() collapse to distinct rows and
# ungroup() drops the speakerid grouping afterwards.
data2 <- data %>%
  group_by(speakerid) %>%
  mutate(tone = mean(tone, na.rm = TRUE),
         mil = max(mil, na.rm = TRUE),
         cas = sum(geo_fatal, na.rm = TRUE),
         dwnom1 = mean(dwnom1, na.rm = TRUE),
         rep = max(rep, na.rm = TRUE),
         n = n()) %>%
  filter(n > 25) %>%
  dplyr::select(speaker, speakerid, state, tone, rep, dwnom1, n, auth_yea, auth_nay) %>%
  unique() %>%
  ungroup()

# Weighting the mean: each speaker's mean tone is shrunk toward the overall
# mean, with weight n / var(speech-level tone) on the speaker's own mean and
# 1 / var(speaker-level mean tone) on the overall mean.
data2$tone_mean <- mean(data$tone, na.rm = TRUE)
data2$tone_var <- var(data$tone, na.rm = TRUE)
data2$tone_speaker_var <- var(data2$tone, na.rm = TRUE)
data2$tone_weighted <- ((data2$n/data2$tone_var)*(data2$tone) + (1/data2$tone_speaker_var)*(data2$tone_mean)) /
  ((data2$n/data2$tone_var) + (1/data2$tone_speaker_var))
data2 <- arrange(data2, tone_weighted)

# Removing duplicates because of name misspelling: dropping the name column
# leaves one row per distinct (speakerid, tone_weighted, n).
data3 <- data2 %>%
  dplyr::select(speakerid, tone_weighted, n) %>%
  unique()

tail(data2, 20) # Lists names
head(data2, 20) # Lists names

# Bar labels, typed by hand from the listings above.
# NOTE(review): they must stay in the same order as the first and last ten
# rows of data3 -- re-check whenever the underlying data change.
neg_names <- c("John Lewis (D-GA)",
               "Cynthia McKinney (D-GA)",
               "Rand Paul (R-KY)",
               "Ron Paul (R-TX)",
               "Bernie Sanders (I-VT)",
               "Maurice Hinchey (D-NY)",
               "Jimmy Duncan (R-TN)",
               "Byron Dorgan (D-ND)",
               "Jim McDermott (D-WA)",
               "Ted Poe (R-TX)")
pos_names <- c("Millender-McDonald (D-CA)",
               "Johnny Isakson (R-GA)",
               "Mark Pryor (D-AR)",
               "Henry Hyde (R-IL)",
               "Richard Lugar (R-IN)",
               "Michael Bennet (D-CO)",
               "Tom Cole (R-OK)",
               "John Warner (R-VA)",
               "Nita Lowey (D-NY)",
               "Robin Hayes (R-NC)")
pos <- tail(data3$tone_weighted, 10)
neg <- head(data3$tone_weighted, 10)

# Bar colours come from the party letter inside each label ("(D-GA)" -> "D"),
# so they can no longer fall out of step with the names. This reproduces the
# previous hand-typed colour vector exactly.
speaker_labels <- c(neg_names, pos_names)
party_cols <- c(D = "#3288bd", R = "#d53e4f", I = "#D3D3D3")
speaker_party <- sub(".*\\(([A-Z])-[A-Z]{2}\\)$", "\\1", speaker_labels)

pdf("plots/fig2.pdf", height = 7, width = 7.5)
par(mar = c(5, 13, 3, 3), oma = c(0, 0, 0, 0), mfrow = c(1, 1))
barplot(c(neg, pos), horiz = TRUE, names.arg = speaker_labels, las = 1,
        col = unname(party_cols[speaker_party]),
        xlab = "Average Tone (Weighted)", xlim = c(-1.25, 1.25))
legend("bottomright", legend = c("Republican", "Democrat", "Independent"),
       fill = c("#d53e4f", "#3288bd", "#D3D3D3"), bty = "n")
dev.off()
################################################################################
# 4 - Regression Analysis
################################################################################
########
# Table 2: Veteran Status and Semantic Tone When Referencing Iraq or Afg.
########
# Four linear mixed models of tone, each with a random intercept per speaker
# ((1 | speakerid)). The order of terms is left as written because it sets
# the order of rows in the printed table.
# m1: mil only.
m1 <- lmer(tone ~ mil + (1 | speakerid),  data=data)
# m2: adds rep.
m2 <- lmer(tone ~ mil + rep + (1 | speakerid),  data=data)
# m3: adds the remaining covariates plus one indicator per distinct value of
# date_month2 (entered as a factor).
m3 <- lmer(tone ~ mil + rep + woman + age +
pip + obama + senate +
word_count + word_count2 + as.factor(date_month2)  + (1 | speakerid),  data = data)
# m4: same covariates as m3 plus mil_rep.
# NOTE(review): mil_rep is presumably the mil x rep interaction built earlier
# in the script -- confirm.
m4 <- lmer(tone ~ mil + rep + mil_rep + pip + obama + senate + woman + age +
word_count + word_count2 + as.factor(date_month2)  + (1 | speakerid),  data = data)
# Print the four models side by side, keeping only the listed coefficients
# (this hides the date_month2 indicators). 'geo_fatal_cumulat' is listed but
# is not a term in any of these four models.
stargazer(m1, m2, m3, m4,
keep = c('mil', 'rep', 'mil_rep', 'pip', 'obama', 'senate', 'woman',
'age', 'geo_fatal_cumulat',
'word_count', 'word_count2'), type = 'text')
########
# Table 3 (Table A24): Evidence of Casualty Sensitivity Among Veterans and Non-veterans
#          Random Effects Specifications
########
# Mixed models with a random intercept per speaker. mil*geo_fatal expands to
# both main effects plus their interaction. m1 and m2 use geo_fatal; m3 and m4
# use geo_fatal_cumulat; m2 and m4 add the further covariates. Every model
# includes one indicator per distinct value of date_month2.
# Note that m1-m4 overwrite the Table 2 model objects of the same names.
m1 <- lmer(tone ~ mil*geo_fatal + rep + as.factor(date_month2) + (1| speakerid),  data=data)
m2 <- lmer(tone ~ mil*geo_fatal + rep + woman + age +
pip + obama + senate +
word_count + word_count2  + as.factor(date_month2) +  (1| speakerid),  data = data)
m3 <- lmer(tone ~ mil*geo_fatal_cumulat + rep + as.factor(date_month2) + (1| speakerid),  data=data)
m4 <- lmer(tone ~ mil*geo_fatal_cumulat + rep + woman + age +
pip + obama + senate +
word_count + word_count2  + as.factor(date_month2) +  (1| speakerid),  data = data)
# Keep only the listed coefficients in the printed table. 'nat_fatal_month',
# 'dwnom1' and 'dwnom2' are listed but are not terms in these four models.
stargazer(m1, m2, m3, m4,
keep = c('mil','geo_fatal', 'geo_fatal_cumulat','nat_fatal_month','dwnom1','dwnom2','rep',
'woman', 'age', 'pip', 'obama', 'senate'), type="text")
# OLS models with one indicator per speaker (as.factor(speakerid)), fitted
# separately to the mil == 0 rows (m1, m2) and the mil == 1 rows (m3, m4).
# Within each subset, one model uses nat_fatal_month and the other uses
# log_nat_fatal_cumulat.
# NOTE(review): no table label is given for these four models in this file --
# confirm which table they reproduce.
m1 <- lm(tone ~ nat_fatal_month +  word_count + word_count2 + months + as.factor(speakerid),data=subset(data,mil==0))
m2 <- lm(tone ~ log_nat_fatal_cumulat + word_count + word_count2 + months + as.factor(speakerid),data=subset(data,mil==0))
m3 <- lm(tone ~ nat_fatal_month + word_count + word_count2 + months + as.factor(speakerid),data=subset(data,mil==1))
m4 <- lm(tone ~ log_nat_fatal_cumulat + word_count + word_count2 + months + as.factor(speakerid),data=subset(data,mil==1))
# Keep only the listed coefficients, which hides the speaker indicators.
stargazer(m1, m2, m3, m4, type  = 'text',
keep = c('nat_fatal_month', 'nat_fatal_cumulat',
'log_nat_fatal_cumulat', 'log_nat_fatal',
'word_count','word_count2','months'))
####
# Author: M. Kenwick, S. Lee, B. Kolcak
# Purpose: Fightin' Words analysis
# Date: May 19, 2025
####
# NOTE(review): rm(list=ls()) deletes every (non-hidden) object in the global
# environment, including anything created by code that ran earlier in the
# same session.
rm(list=ls())
library(lme4)
library(dplyr)
library(ggplot2)
library(tidyr)
library(ggrepel)
library(quanteda)
library(stringr)
# NOTE(review): machine-specific working directory. The relative paths used
# below ('master.csv', 'speech_data/...', 'plots/...') are resolved against it.
setwd('~/Dropbox/cmr_cong/replication/')
################################################################################
# 0 - Load functions from Monroe
# From https://burtmonroe.github.io/TextAsDataCourse/Tutorials/TADA-FightinWords.nb.html
################################################################################
# Fightin' Words statistics for terms across groups of documents.
#
# Arguments:
#   dtm      document-term count matrix (documents in rows, terms in columns).
#   groups   factor with one entry per row of dtm giving each document's group.
#   pair     NULL, or two row selectors (e.g. group names) for a two-group
#            comparison.
#   weights  one weight per document; NA is treated as 0. Weights are rescaled
#            to average one.
#   k.prior  multiplier on the prior counts added to the weighted counts.
#
# Value: a list with elements zeta, delta, se, counts and acounts.
#   pair = NULL: zeta, delta, se and acounts are group-by-term matrices and
#     counts holds the weighted counts summed by group.
#   pair given:  zeta, delta and se are vectors for the FIRST group of the
#     pair; counts is the per-term total of the weighted counts and acounts
#     the per-term total of the smoothed counts, both over all groups.
fwgroups <- function(dtm, groups, pair = NULL, weights = rep(1,nrow(dtm)), k.prior = .1) {
  # Missing weights count as zero; rescale so the weights average to one.
  weights[is.na(weights)] <- 0
  weights <- weights/mean(weights)

  # Drop documents with no tokens or zero weight, then any term that has no
  # tokens left in the remaining documents.
  zero.doc <- rowSums(dtm)==0 | weights==0
  zero.term <- colSums(dtm[!zero.doc,])==0

  # Multiply every term column by the per-document weights.
  dtm.nz <- apply(dtm[!zero.doc,!zero.term],2,"*", weights[!zero.doc])

  # Prior count for each document/term cell: row total * column total / grand
  # total. A share k.prior of it is added to the weighted counts.
  g.prior <- tcrossprod(rowSums(dtm.nz),colSums(dtm.nz))/sum(dtm.nz)
  g.posterior <- as.matrix(dtm.nz + k.prior*g.prior)

  # Keep the group labels of the retained documents and drop unused levels.
  groups <- groups[!zero.doc]
  groups <- droplevels(groups)

  # Smoothed counts summed within group (the first column of the aggregate()
  # result is the group label, hence [,-1]).
  g.adtm <- as.matrix(aggregate(x=g.posterior,by=list(groups=groups),FUN=sum)[,-1])
  rownames(g.adtm) <- levels(groups)

  # delta: log smoothed counts, centred first within term (across groups) and
  # then within group (across terms).
  g.ladtm <- log(g.adtm)
  g.delta <- t(scale( t(scale(g.ladtm, center=TRUE, scale=FALSE)), center=TRUE, scale=FALSE))

  # The other three cells of the 2x2 table behind each (group k, term w) count.
  g.adtm_w <- -sweep(g.adtm,1,rowSums(g.adtm)) # terms not w spoken by k
  g.adtm_k <- -sweep(g.adtm,2,colSums(g.adtm)) # w spoken by groups other than k
  g.adtm_kw <- sum(g.adtm) - g.adtm_w - g.adtm_k - g.adtm # total terms not w or k

  # Standard error from the reciprocals of the four cells; zeta = delta / se.
  g.se <- sqrt(1/g.adtm + 1/g.adtm_w + 1/g.adtm_k + 1/g.adtm_kw)
  g.zeta <- g.delta/g.se

  # Weighted (unsmoothed) counts summed within group.
  g.counts <- as.matrix(aggregate(x=dtm.nz, by = list(groups=groups), FUN=sum)[,-1])

  if (!is.null(pair)) {
    # Same computation restricted to the two rows selected by pair.
    pr.delta <- t(scale( t(scale(g.ladtm[pair,], center = TRUE, scale = FALSE)), center=TRUE, scale=FALSE))
    pr.adtm_w <- -sweep(g.adtm[pair,],1,rowSums(g.adtm[pair,]))
    pr.adtm_k <- -sweep(g.adtm[pair,],2,colSums(g.adtm[pair,])) # w spoken by groups other than k
    pr.adtm_kw <- sum(g.adtm[pair,]) - pr.adtm_w - pr.adtm_k - g.adtm[pair,] # total terms not w or k
    pr.se <- sqrt(1/g.adtm[pair,] + 1/pr.adtm_w + 1/pr.adtm_k + 1/pr.adtm_kw)
    pr.zeta <- pr.delta/pr.se
    # Only the first group's row is returned.
    return(list(zeta=pr.zeta[1,], delta=pr.delta[1,],se=pr.se[1,], counts = colSums(dtm.nz), acounts = colSums(g.adtm)))
  } else {
    return(list(zeta=g.zeta,delta=g.delta,se=g.se,counts=g.counts,acounts=g.adtm))
  }
}
# Plotting helper
# Converts each colour in someColor to a "#RRGGBBAA" hex string whose alpha
# channel is `alpha`, on a 0-255 scale.
makeTransparent <- function(someColor, alpha = 100) {
  channels <- col2rgb(someColor) # 3 x n matrix: red, green, blue rows
  to_hex <- function(ch) {
    rgb(red = ch[1], green = ch[2], blue = ch[3],
        alpha = alpha, maxColorValue = 255)
  }
  apply(channels, 2, to_hex)
}
# Builds a faceted ggplot of the top-ranked terms for each group from a
# Fightin' Words result.
#
# Arguments:
#   fw.ch         list with elements zeta and counts, as returned by fwgroups().
#   groups.use    groups to plot; defaults to the row names of fw.ch$zeta.
#   max.words     keep terms whose zeta rank within a group is <= max.words.
#   max.countrank keep terms whose overall count rank is <= max.countrank.
#   colorpalette  one colour per group, applied at alpha .7.
#   sizescale     multiplier for label size; points use sizescale / 2.
#   title, subtitle, caption  passed to labs().
#
# Value: the ggplot object. The last statement is an assignment, so the value
# is returned invisibly -- callers must print it to draw the plot.
fw.ggplot.groups <- function(fw.ch, groups.use = as.factor(rownames(fw.ch$zeta)), max.words = 50, max.countrank = 400, colorpalette=rep("black",length(groups.use)), sizescale=2, title="Comparison of Terms by Groups", subtitle = "", caption = "Group-specific terms are ordered by Fightin' Words statistic (Monroe, et al. 2008)") {
# Rank terms so that rank 1 is the largest zeta within a group and the
# largest count overall.
if (is.null(dim(fw.ch$zeta))) {## two-group fw object consists of vectors, not matrices
# Second column ranks by ascending zeta, i.e. the terms of the other group.
zetarankmat <- cbind(rank(-fw.ch$zeta),rank(fw.ch$zeta))
colnames(zetarankmat) <- groups.use
countrank <- rank(-(fw.ch$counts))
} else {
# apply() over rows returns one column of term ranks per group.
zetarankmat <- apply(-fw.ch$zeta[groups.use,],1,rank)
countrank <- rank(-colSums(fw.ch$counts))
}
# Wide table: one row per term, one rank column per group, plus countrank.
wideplotmat <- as_tibble(cbind(zetarankmat,countrank=countrank))
wideplotmat$term=names(countrank)
#rankplot <- gather(wideplotmat, party, zetarank, 1:ncol(zetarankmat))
# Long table: one row per (term, group), with the group label in groups.use.
rankplot <- gather(wideplotmat, groups.use, zetarank, 1:ncol(zetarankmat))
# Label size shrinks with rank: sizescale * (50 / rank)^(1/4).
rankplot$plotsize <- sizescale*(50/(rankplot$zetarank))^(1/4)
rankplot <- rankplot[rankplot$zetarank < max.words + 1 & rankplot$countrank<max.countrank+1,]
# Facets follow the order of groups.use.
rankplot$groups.use <- factor(rankplot$groups.use,levels=groups.use)
# x is nrow(rankplot) - countrank (larger for terms with higher overall
# counts); y is -zetarank (higher for terms more distinctive of the group).
p <- ggplot(rankplot, aes((nrow(rankplot)-countrank)^1, -(zetarank^1), colour=groups.use)) +
geom_point(show.legend=F,size=sizescale/2) +
theme_classic() +
theme(axis.ticks=element_blank(), axis.text=element_blank() ) +
ylim(-max.words,40) +
facet_grid(groups.use ~ .) +
geom_text_repel(aes(label = term), size = rankplot$plotsize, point.padding=.05,
box.padding = unit(0.20, "lines"), show.legend=F) +
scale_colour_manual(values = alpha(colorpalette, .7)) +
#    labs(x="Terms used more frequently overall →", y="Terms used more frequently by group →",  title=title, subtitle=subtitle , caption = caption)
labs(x=paste("Terms used more frequently overall -->"), y=paste("Terms used more frequently by group -->"),  title=title, subtitle=subtitle , caption = caption)
}
# Top terms per group from a multi-group Fightin' Words result.
#
# Arguments:
#   fw.ch   list whose zeta element is a group-by-term matrix with term names
#           as column names (the pair = NULL result of fwgroups()).
#   n.keys  number of terms to return per group.
#
# Value: a character matrix with n.keys rows and one column per group (named
# by the row names of zeta). Each column lists that group's terms in order of
# decreasing zeta. Entries are NA when n.keys exceeds the number of terms.
fw.keys <- function(fw.ch,n.keys=10) {
  # A two-group result stores zeta as a plain vector, which has no rows to
  # loop over; fail with a clear message instead of an obscure matrix() error.
  if (is.null(dim(fw.ch$zeta))) {
    stop("fw.keys() needs a group-by-term zeta matrix (fwgroups() with pair = NULL)",
         call. = FALSE)
  }
  n.groups <- nrow(fw.ch$zeta)
  keys <- matrix("", n.keys, n.groups)
  colnames(keys) <- rownames(fw.ch$zeta)
  # seq_len() keeps both loops empty when n.groups or n.keys is 0 (1:0 would
  # count down from 1 to 0).
  for (g in seq_len(n.groups)) {
    ranked <- sort(fw.ch$zeta[g, ], decreasing = TRUE)
    keys[, g] <- names(ranked[seq_len(n.keys)])
  }
  keys
}
####
################################################################################
# 1 - Load data
################################################################################
# Speaker data: speech-level metadata, kept only where keyword_count >= 4.
# mil is 1 when military_collapse is above 0 and 0 otherwise.
meta <- read.csv('master.csv')
meta$mil <- ifelse(meta$military_collapse > 0, 1, 0)
meta <- meta %>%
  dplyr::filter(keyword_count >= 4)
meta$party[meta$party == "I"] <- "D" # Lumping independents w/ Democrats

# Load speeches and subset to Irq/Afg speeches.
# Each Congress file (107-113) is read and immediately cut down to the speech
# ids present in meta, so only the matching rows of each file are kept in
# memory. This replaces seven copy-pasted read/filter pairs.
speech_ids <- unique(meta$speech_id)
read_war_speeches <- function(cong) {
  sp <- read.delim(sprintf('speech_data/speeches_%d.txt', cong),
                   header = TRUE, sep = "|", quote = "",
                   stringsAsFactors = FALSE, encoding = "UTF-8")
  sp[sp$speech_id %in% speech_ids, ]
}
speeches <- do.call(rbind, lapply(107:113, read_war_speeches))

# Attach party, speaker id and veteran status to every speech; speeches with
# no match in meta are kept (all.x = TRUE) with NA in those columns.
# NOTE(review): if meta holds more than one row for a speech_id, that speech
# is duplicated here -- confirm speech_id is unique in meta.
data <- merge(speeches, dplyr::select(meta, speech_id, party, speakerid, mil),
              by = "speech_id", all.x = TRUE, all.y = FALSE)
data <- tibble(data)
# Remove punctuation etc.: every punctuation character becomes a space.
# length is the character count of the speech after that replacement.
data$speech <- str_replace_all(data$speech, "[[:punct:]]", " ")
data$length <- nchar(data$speech)

#### Prepare Data for FW
# One document per row of data, identified by speech_id.
corp <- corpus(data, text_field = "speech", docid_field = "speech_id")
# Tokenise, drop numbers / punctuation / symbols, remove snowball stopwords,
# and stem. tokens_ngrams(1) keeps single-word tokens.
tok <- tokens(corp, remove_numbers = TRUE, remove_punct = TRUE,
              remove_symbols = TRUE) %>%
  tokens_remove(stopwords(source = "snowball")) %>%
  tokens_wordstem() %>%
  tokens_ngrams(1)
dfm.full <- dfm(tok, verbose = TRUE, tolower = TRUE)
# Trim to terms that meet the document-frequency and term-frequency minimums.
dfmtrimmed <- dfm_trim(dfm.full, min_docfreq = 30, min_termfreq = 50, verbose = TRUE)
# Groups: party by veteran status. Rows where mil or party is NA, or party is
# neither "D" nor "R", stay "Other".
# NOTE(review): the labels mix "Non-veteran" and "Non-Veteran"; they are left
# as is because they appear as facet labels in the figure.
data$groups <- "Other"
data$groups[data$mil == 0 & data$party == "D"] <- "Dem., Non-veteran"
data$groups[data$mil == 1 & data$party == "D"] <- "Dem., Veteran"
data$groups[data$mil == 0 & data$party == "R"] <- "Rep., Non-Veteran"
data$groups[data$mil == 1 & data$party == "R"] <- "Rep., Veteran"

# Generate Figure 4
fw_test2 <- fwgroups(dfmtrimmed, groups = as.factor(data$groups))
fwkeys_test2 <- fw.keys(fw_test2, n.keys = 15)
# NOTE(review): the palette has four colours, one per party/veteran group in
# alphabetical level order; an "Other" group would add a fifth level -- confirm
# none is present.
plot_test2 <- fw.ggplot.groups(
  fw_test2, sizescale = 4, max.words = 200, max.countrank = 400,
  colorpalette = c("steelblue", "darkolivegreen4", "orangered", "darkolivegreen4")
)
pdf("plots/fig4.pdf", height = 9, width = 12)
# Print explicitly: a bare object name is drawn only by top-level autoprint,
# so the PDF would come out empty when this script is run through source().
print(plot_test2)
dev.off()
